{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#global current_token\n",
    "import datetime\n",
    "import gc\n",
    "import random\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.sparse as sprs\n",
    "import vk\n",
    "from requests.exceptions import ConnectionError\n",
    "from requests.exceptions import ReadTimeout\n",
    "from scipy.sparse import csr_matrix, lil_matrix\n",
    "from tqdm.auto import tqdm\n",
    "from vk.exceptions import VkAPIError"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data to encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw inputs: a dense table from 'dem.npy' and a sparse matrix from 'publ.npz'.\n",
    "dem_table = pd.DataFrame(np.load('dem.npy'))\n",
    "sub = sprs.load_npz('publ.npz')\n",
    "\n",
    "# Coerce columns 0-2 with pd.to_numeric (apply runs it column by column).\n",
    "numeric_columns = [0, 1, 2]\n",
    "dem_table[numeric_columns] = dem_table[numeric_columns].apply(pd.to_numeric)\n",
    "\n",
    "# Later cells index `dem` as a plain 2-D array, so unwrap the DataFrame again.\n",
    "dem = dem_table.values\n",
    "\n",
    "# Drop the helpers so only `dem` and `sub` remain in the notebook namespace.\n",
    "del dem_table, numeric_columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load feature space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ordered feature values; an entry's position is its column in the encoded matrices.\n",
    "subscriptions = np.load('encoded_data/subscriptions.npy')\n",
    "political = np.load('encoded_data/political.npy')\n",
    "\n",
    "# Reverse lookup: subscription value -> its position in `subscriptions`.\n",
    "# If a value occurs more than once, the last position wins.\n",
    "index = {value: position for position, value in enumerate(subscriptions)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode subscriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per row of `sub`, one column per entry of `subscriptions`.\n",
    "# NOTE(review): recent SciPy releases may reject float16 for sparse matrices --\n",
    "# confirm against the installed version before relying on this dtype.\n",
    "X_sub = lil_matrix((sub.shape[0], subscriptions.shape[0]), dtype=np.float16)\n",
    "\n",
    "# Walk the stored values of `sub` one by one. `row` tracks which row of `sub`\n",
    "# the entry at `pos` belongs to, using the row-pointer array `sub.indptr`\n",
    "# (assumes `sub` is stored as CSR -- TODO confirm).\n",
    "row = 0\n",
    "for pos in tqdm(range(len(sub.data))):\n",
    "    # Advance past every row (including empty ones) that ends at or before `pos`.\n",
    "    while sub.indptr[row + 1] <= pos:\n",
    "        row += 1\n",
    "    # Values that are not part of the feature space are skipped on purpose.\n",
    "    column = index.get(sub.data[pos])\n",
    "    if column is not None:\n",
    "        X_sub[row, column] = 1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the matrix to CSR, then write it to 'X_sub_new.npz'.\n",
    "X_sub = X_sub.tocsr()\n",
    "sprs.save_npz(file='X_sub_new.npz', matrix=X_sub)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode political preferences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per row of `dem`, one column per entry of `political`.\n",
    "X_political = lil_matrix((dem.shape[0], political.shape[0]), dtype=np.float16)\n",
    "\n",
    "for i in range(dem.shape[0]):\n",
    "    # Column 2 of `dem` is the value matched against the `political` entries.\n",
    "    pref = dem[i, 2]\n",
    "    for j in range(political.shape[0]):\n",
    "        # Mark every column whose value equals this row's preference;\n",
    "        # rows with no match stay all-zero.\n",
    "        if pref == political[j]:\n",
    "            X_political[i, j] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the matrix to CSR, then write it to 'X_political_new.npz'.\n",
    "X_political = X_political.tocsr()\n",
    "sprs.save_npz(file='X_political_new.npz', matrix=X_political)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
